################################################################################
################################################################################
##### CREATE GROUP-LEVEL DATASET FOR USE IN REPLICATION CODE ###################
################################################################################
################################################################################

## Set directories appropriately
home.dir <- "."
data.dir <- paste0(home.dir, "/Data")
save.dir <- paste0(home.dir, "/R-output")
plot.dir <- paste0(save.dir, "/Plots")
out.dir <- paste0(save.dir, "/Out")

################################################################################
#### LIBRARIES #################################################################
################################################################################

library(rstan)
library(foreign)
library(parallel)
library(TeachingDemos)
library(ggplot2)
library(reshape2)
library(car)
library(plyr)
library(dplyr)
library(survey)
library(stringr)
library(data.table)
library(rms)

################################################################################
#### FUNCTIONS #################################################################
################################################################################

as.numchar <- function(x) as.numeric(as.character(x))
LoopProgress <- function(index, interval=1) {
    if (index %% interval == 0) print (index)
}
FileName <- function (path=NULL, name=NULL, ext=NULL, replace=FALSE) {
    p <- paste(path, collapse = "")
    n <- paste(name, collapse = "")
    e <- paste(".", ext, sep = "")
    d <- format(Sys.Date(), "%y%m%d") 
    for (l in seq_along(letters)) {
        if (l > 1) old_file_name <- file_name
        file_name <- paste0(p, d, paste(n, letters[l], sep="-"), e)
        if (!any(grepl(file_name, list.files()))) {
            if (replace) file_name <- ifelse(l > 1, old_file_name, file_name)
            break
        }
    }
    cat("\nFile name:", file_name, "\n\n")
    return(file_name)
}
anyValid <- function (x) any(!is.na(x))
POtoSouth11 <- function (pos, dta=st.info) {
    dta$South11[match(pos, dta$POAbrv)]
}
POtoSouth13 <- function (pos, dta=st.info) {
    dta$South13[match(pos, dta$POAbrv)]
}
NameToPO <- function (stnames, dta=st.info) {
    as.character(dta$POAbrv[match(tolower(stnames), tolower(dta$Name))])
}
POtoRegion4 <- function (pos, dta=st.info) {
    as.character(dta$REGION4[match(pos, dta$POAbrv)])
}
Interpolate <- function (last.yr, next.yr, this.yr, last.value, next.value) {
    w.l <- (next.yr - this.yr) / (next.yr - last.yr)
    w.n <- (this.yr - last.yr) / (next.yr - last.yr)
    out <- last.value * w.l + next.value * w.n
    return(out)
}
myRecode <- function(var, recode.ls, as.factor.result=TRUE, ...) {
    options(useFancyQuotes = FALSE)
    recode.ls <- sapply(recode.ls, function(x) paste(sQuote(x), collapse='='))
    recodes <- paste(recode.ls, collapse=';')
    print(recodes)
    revar <- Recode(var, recodes, as.factor.result=TRUE, ...)
    options(useFancyQuotes = TRUE)
    revar
}
mypdf <- function(name, ...) {
    date <- format(Sys.Date(), "%y%m%d")
    pdf(paste(paste(name, collapse=""), date, ".pdf", sep=""), ...)
}
mysw <- function (...) {
    setwd(paste(..., sep = "/"))
}
NAto0 <- function (x) {
    y <- x
    y[is.na(y)] <- 0
    return(y)
}
PasteEval <- function(..., print = FALSE) {
    txt <- paste(..., sep = "")
    if (print) { cat("\n", txt, "\n") }
    eval(parse(text = txt), envir = parent.frame())
}
Formula <- function (vars, y=NULL, text=FALSE) {
    Y <- paste(y, "~")
    vars <- paste(vars, collapse = " + ")
    if (text) return(paste(Y, vars))
    else return(as.formula(paste(Y, vars)))
}
grepSub <- function (obj, pattern, drop=FALSE, ...) {
    sub <- grepl(pattern, x=obj, ...)
    if (drop) sub <- !sub
    return (obj[sub])
}
RakePartial <- PasteEval(sub("compress = FALSE",
                             "compress = FALSE, partial=TRUE", deparse(rake)))

################################################################################
#### LOAD AND PREPARE DATA #####################################################
################################################################################

poll.set <- "modern"

## Load state-level data
mysw(data.dir, "state_data")
st.info <- read.csv("StateCodes.csv")
lower48 <- setdiff(as.character(st.info$POAbrv), c("AK", "HI", "DC"))
incom <- read.csv("states_income_real_19402010_yearly.csv") ## 1940
incom$StPOAbrv <- incom$abb
evang <- read.csv("states_evangelicals_19502010_yearly.csv") ## back to 1950
evang$StPOAbrv <- NameToPO(evang$state)
urban <- read.csv("states_urban_19502010_yearly.csv") ## 1950
urban$StPOAbrv <- urban$abb
union <- read.csv("states_union_19612010_yearly.csv") ## 1961
union$StPOAbrv <- NameToPO(union$state)
st.dta <- incom
st.dta <- merge(x=st.dta, y=evang, by=c("year", "StPOAbrv"), all.x=TRUE)
st.dta <- merge(x=st.dta, y=urban, by=c("year", "StPOAbrv"), all.x=TRUE)
st.dta <- merge(x=st.dta, y=union, by=c("year", "StPOAbrv"), all.x=TRUE)
st.dta <- plyr:::arrange(st.dta, year, StPOAbrv)
names(st.dta) <- gsub("percent", "prop", names(st.dta))
st.vars <- c("prop_evangelicals", "income_percapita", "prop_union",
             "prop_urban")
st.dta <- subset(st.dta,, c("year", "StPOAbrv", st.vars))
st.dta <- mutate(st.dta, REGION4=factor(POtoRegion4(StPOAbrv), exclude=""))
for (lev in levels(st.dta$REGION4)) {
    st.dta[lev] <- as.integer(st.dta$REGION4 == lev)
}
summary(st.dta)

constant.item <- as.integer(TRUE) ## difficulty parameters constant over time?
change.at.election <- as.integer(FALSE) ## change model at elections?
separate.years <- as.integer(FALSE) ## no smoothing over time?

### LOAD POLL DATA
q.subset <- "GovIsLib"
min.yr <- 1972
max.yr <- 2012
min.years <- 1
min.polls <- 1
mysw(data.dir)
load("GSS/GSS.RData")
poll.df <- read.dta("Survey Data for DIRT Model/all_survey_keep140428.dta")
q.info <- read.csv("items_dimension141105.csv", stringsAsFactors=FALSE)
q.cols <- grep("cces2006_minimumwage", names(poll.df)):ncol(poll.df)
q.vars <- names(poll.df)[q.cols]
stopifnot(all(q.vars %in% q.info$item))
if (q.subset == "GovIsLib") {
    poll.set <- "modern-GovIsLib"
    q.vars <- q.info$item[q.info$lib_is_gov == 1]
    poll.df <- poll.df[, !names(poll.df) %in%
                       q.info$item[q.info$lib_is_gov == 0]]
    gss.NN <- gss.NN.gov
    gss.SS <- gss.SS.gov
    gss.df.table <- gss.df.table.gov 
}
if (q.subset == "AllLib") {
    poll.set <- "modern-AllLib"
    q.vars <- q.info$item
    gss.NN <- gss.NN.all
    gss.SS <- gss.SS.all
    gss.df.table <- gss.df.table.all
}
## Rename GSS questions asked on other polls
## N.B. Collapsing these two variables introduces a small inconsistency with the
## data used in the paper
colnames(gss.NN) <- ifelse(colnames(gss.NN) == "gunlaw_binary",
                           "issue_gunpermit", colnames(gss.NN))
colnames(gss.NN) <- ifelse(colnames(gss.NN) == "cappun_binary",
                           "death_penalty", colnames(gss.NN))
colnames(gss.SS) <- colnames(gss.NN) <- paste0(colnames(gss.NN), '.gt0')
q.vars <- sort(setdiff(q.vars, "schoolprayer_amendment"))
gss.vars <- colnames(gss.NN)
print(c(q.vars, gss.vars))

drop.cces <- TRUE
if (drop.cces) {
    poll.df <- subset(poll.df, organization != "CCES")
}
## State PO abbreviation
table(poll.df$StPOAbrv <- factor(poll.df$abb, exclude=c("", "NA")))
stopifnot(all(levels(poll.df$StPOAbrv) %in% levels(st.info$POAbrv)))
poll.df <- subset(poll.df, StPOAbrv %in% levels(st.info$POAbrv))
gss.df.table$StPOAbrv <- gss.df.table$STATEAB
## Education
edu5.labels <- c("No High School Degree", "High School Grad",
                 "Some College", "College Graduate", "Graduate School")
edu4.labels <- c("No High School Degree", "High School Grad",
                 "Some College", "College Graduate")
gss.df.table <- mutate(gss.df.table,
                       Edu5=factor(education5, labels=edu5.labels),
                       Edu5=ordered(Edu5, levels=edu5.labels),
                       Edu4=Recode(Edu5, 'c("College Graduate",
                             "Graduate School")= "College Graduate"'),
                       Edu4=ordered(Edu4, levels=edu4.labels))
poll.df <- mutate(poll.df,
                  Edu5=factor(education, labels=edu5.labels),
                  Edu5=ordered(Edu5, levels=edu5.labels),
                  Edu4=Recode(Edu5, 'c("College Graduate",
                        "Graduate School")= "College Graduate"'),
                  Edu4=ordered(Edu4, levels=edu4.labels))
## Female
table(poll.df$Female <- factor(poll.df$gender, labels=c("Male", "Female")))
table(gss.df.table$Female <- factor(gss.df.table$gender,
                                    labels=c("Male", "Female")))
## Race
nlevels.race <- 3
race.labels <- c("White or Hispanic", "Black", "Other")
poll.df$Race3 <- factor(poll.df$race_new, labels=race.labels)
poll.df$Race2 <- "Non-Black"
poll.df$Race2 <- ifelse(poll.df$Race3 == "Black", "Black",
                        poll.df$Race2)
is.na(poll.df$Race2) <- is.na(poll.df$Race3)
poll.df$Race2 <- factor(poll.df$Race2)
## GSS
gss.df.table$Race3 <- factor(gss.df.table$race, labels=race.labels) 
gss.df.table$Race2 <- "Non-Black"
gss.df.table$Race2 <- ifelse(gss.df.table$Race3 == "Black", "Black",
                             gss.df.table$Race2)
is.na(gss.df.table$Race2) <- is.na(gss.df.table$Race3)
gss.df.table$Race2 <- factor(gss.df.table$Race2)
## PID
poll.df$PID3 <- ordered(poll.df$pid3, labels=c("D", "I", "R"))
## Year
poll.df <- mutate(poll.df, Year=as.integer(year),
                  ElecYear=ifelse(month < 11, Year, Year + 1),
                  ElecYear=ifelse(month == 11 & !grepl("NES", source),
                      NA, ElecYear))
gss.df.table <- mutate(gss.df.table, Year=as.integer(as.character(year)),
                       ElecYear=Year)
## weight variable
if (!"weight" %in% names(poll.df)) poll.df$weight <- 1
poll.df <- mutate(poll.df, weight=ifelse(weight > 0, weight, 1))
(fm.vars <- setdiff(names(poll.df), q.vars))
summary(poll.df)
summary(gss.df.table)
min.yr <- max(c(min.yr,
                min(min(poll.df$Year, na.rm=TRUE), min(gss.df.table$Year))))
max.yr <- min(c(max.yr,
                max(max(poll.df$Year, na.rm=TRUE), max(gss.df.table$Year))))
(yr.range <- min.yr:max.yr)

## Drop years outside year range
if (change.at.election) {
    poll.df <- mutate(poll.df, YearFactor=factor(ElecYear, yr.range))
    if (any(grepl("gss", ls()))) {
        gss.df.table <- mutate(gss.df.table,
                               YearFactor=factor(ElecYear, yr.range))
    }
} else {
    poll.df <- mutate(poll.df, YearFactor=factor(Year, yr.range))
    if (any(grepl("gss", ls()))) {
        gss.df.table <- mutate(gss.df.table, YearFactor=factor(Year, yr.range))
    }
}
fm.vars <- c(fm.vars, "YearFactor")
dim(poll.df <- subset(poll.df, !is.na(YearFactor)))
gc()
## Drop variables with no valid responses
valid.vars <- names(poll.df)[sapply(poll.df, anyValid)]
(q.vars <- sort(intersect(q.vars, valid.vars)))
dim(poll.df <- subset(poll.df,, valid.vars))
## Drop rows with no valid question responses
dim(poll.df <- subset(poll.df, rowSums(!is.na(poll.df[q.vars])) > 0))
q.vars <- sort(intersect(q.vars, names(poll.df)))
(fm.vars <- sort(intersect(fm.vars, names(poll.df))))
years.asked <- daply(data.frame(!is.na(poll.df)), ~poll.df$YearFactor, colSums)
(years.asked <- years.asked > 0)
(q.vars <- q.vars[colSums(years.asked[, q.vars], na.rm=TRUE) >= min.years])
poll.df <- subset(poll.df,, c(fm.vars, q.vars))
dim(poll.df)
gc()

if (any(grepl("gss", ls()))) {
    gss.years.asked <- ddply(data.frame(gss.df.table, gss.NN), ~YearFactor,
                             function (df) laply(df[gss.vars], sum))
    gss.years.asked[, -1] <- gss.years.asked[, -1] > 0
    colnames(gss.years.asked)[-1] <- colnames(gss.NN)
    years.asked.melt <-
        melt(merge(data.frame(years.asked[, q.vars],
                              YearFactor=as.integer(rownames(years.asked))),
                   gss.years.asked, by="YearFactor", all=TRUE), id="YearFactor")
} else {
    years.asked.melt <-
        melt(data.frame(years.asked[, q.vars],
                        YearFactor=as.integer(rownames(years.asked))),
             id="YearFactor")
}
poll.df$source.year <-
    interaction(poll.df$YearFactor, poll.df$source, drop=TRUE, lex.order=TRUE)
forms.asked <-
    daply(data.frame(!is.na(poll.df)), ~poll.df$source.year, colSums) > 0
forms.asked[, q.vars] <- 
    forms.asked[, q.vars] * apply(forms.asked[, q.vars], 1, sum)
q.vars <- q.vars[colSums(forms.asked[, q.vars], na.rm=TRUE) >= min.polls]
forms.asked.melt <- melt(forms.asked[, q.vars])
names(forms.asked.melt) <- c('Poll', 'Question', 'nQuestions') 

## Transform ordinal questions into binary above/below threshold indicators
for (q in seq_along(q.vars)) {
    LoopProgress(q, 10)
    levels.q <- levels(factor(poll.df[, q.vars[q]]))
    for (l in 1:(length(levels.q) - 1)) {
        poll.df[ncol(poll.df) + 1] <-
            as.integer(poll.df[, q.vars[q]] > levels.q[l])
        names(poll.df)[ncol(poll.df)] <-
            paste0(q.vars[q], ".gt", levels.q[l])
    }
}
(gt.vars <- sort(names(poll.df)[grep(".gt", names(poll.df))]))
gc()

### PREDICTOR VARIABLES
geo.var <- 'StPOAbrv'
demo.vars <- c("Race2")
pre.wt.vars.ls <- as.list(setdiff(c("Race2", "Female", "Edu4"), demo.vars))
geo.mod.vars <- NULL
geo.mod.prior.vars <- "prop_evangelicals0"
(covs <- c(geo.var, demo.vars))
covs.yr <- c(covs, "YearFactor")
pre.wt.vars.ls

### WEIGHTS
## Population targets
mysw(data.dir, "Targets")
target.df <- read.dta("Targets1960to2010-140228.dta")
## Eliminate 0's in population
target.df <- ddply(target.df, ~Year, mutate,
                   Prop = ifelse(Prop == 0, 1e-10, Prop),
                   Prop = Prop / sum(Prop))
if (min.yr < min(target.df$Year)) {
    for (yr in (min(target.df$Year) - 1):min.yr) {
        new.df <- mutate(subset(target.df, Year==min(target.df$Year)),
                         Year=yr)
        target.df <- rbind(new.df, target.df)
    }
}
if (max.yr > max(target.df$Year)) {
    for (yr in (max(target.df$Year + 1)):max.yr) {
        new.df <- mutate(subset(target.df, Year==max(target.df$Year)),
                         Year=yr)
        target.df <- rbind(target.df, new.df)
    }
}
target.df <- mutate(target.df,
                    Female=factor(ifelse(Female, "Female", "Male"),
                        levels(poll.df$Female)),
                    Race2=ifelse(race3 == '2', 'Black', 'Non-Black'),
                    Race2=factor(Race2, levels(poll.df$Race2)),
                    Edu5=factor(as.integer(education2), labels=edu5.labels),
                    Edu5=ordered(Edu5, levels=edu5.labels),
                    Edu4=Recode(Edu5, 'c("College Graduate",
                          "Graduate School") = "College Graduate"'),
                    Edu4=ordered(Edu4, levels=edu4.labels))
target.ds <- svydesign(~1, weights = ~Prop, data = target.df)

## Weight (PS or rake) w/in each group whose mean will be estimated
(pre.wt.form.ls <- llply(pre.wt.vars.ls, function (pwv) {
    vars <- c(pwv, demo.vars)
    vars <- unique(c(vars, geo.var))
    Formula(vars)
}))
(ff <- Formula(c("0", covs)))
(ff.yr <- Formula(c("0", covs.yr)))

## Rows with no missing weighting variables or covariates and >0 valid responses
valid.gt <- !is.na(poll.df[, gt.vars])
table(rowSums(valid.gt))
(wt.covs.yr <- unique(c(unlist(pre.wt.vars.ls), covs.yr)))
use.rows <- rowSums(is.na(poll.df[, wt.covs.yr])) == 0 & rowSums(valid.gt) >= 1
## Items used
gt.vars.use <- gt.vars[colSums(!is.na(poll.df[use.rows, gt.vars])) >= 1]
rm(list="valid.gt")
## Represent covariates in poll.df as model frame
MF <- model.frame(ff, data=poll.df[use.rows, ], na.action=return)
gc()
MF.yr <- model.frame(ff.yr, data=poll.df[use.rows, ], na.action=return)
gc()
## Factor with level for each combination of covariate values
group <- interaction(as.list(MF), sep="__", lex.order=FALSE)
group.yr <- interaction(as.list(MF.yr), sep="__", lex.order=FALSE)
## Matrix of every factor combination (reordered to match group factor order)
xtab.yr <- as.data.frame(table(MF.yr))
xtab <- subset(xtab.yr, YearFactor == yr.range[1], -c(YearFactor, Freq))
## Dummy variable representation of contingency table
XX <- model.matrix(ff, xtab)
## don't use hierarchical model if not pooling over time and only one predictor
if (separate.years==1 && length(covs)==1) XX <- matrix(0, nrow(XX), ncol(XX))
YY <- as.matrix(poll.df[use.rows, gt.vars.use])
table(n.responses <- rowSums(!is.na(YY), na.rm=TRUE))
gc()

## design effects
RakeWeight <- function (df, formula.list, target.design) {
    ds <- svydesign(~1, weights=~weight, data=df)
    pop.list <- llply(formula.list, svytable, design=target.design)
    rk <- RakePartial(ds, formula.list, pop.list)
    return (1/rk$prob)
}
de.df <- data.frame(row=1:sum(use.rows), Nresponses = n.responses, 
                    poll.df[use.rows, fm.vars])
## renormalize weights to be have mean of 1 within polls
de.df <- group_by(de.df, source, YearFactor) %>%
    mutate(weight = weight/mean(weight))
## In each year, create weights that match specified targets
de.df <- ddply(de.df, ~YearFactor, function (df) {
    ds <- subset(target.ds, target.ds$variables$Year==unique(df$YearFactor))
    df$new_weight <- RakeWeight(df, pre.wt.form.ls, ds)
    df <- mutate(df, new_weight = new_weight/mean(new_weight, na.rm=TRUE))
    return(df)
}, .progress = "text")
## calculate design effect within groups
de.df <- mutate(grouped_df(de.df, as.list(covs.yr)),
                de = 1 + (sd(new_weight)/mean(new_weight))^2,
                de = ifelse(is.na(de), 1, de))
de.df <- plyr:::arrange(de.df, row)
summary(de.df)
gc()

## weighted sample size
Nstar <- function (y, de, nr) {
    ob.adj <- as.numeric(!is.na(y)) / nr ## weight by inverse of num responses
    n.star <- ceiling(sum(ob.adj / de)) ## reduce N by design effect
    return(n.star)
}
Sstar <- function (y, de, nr, wt) {
    y.bar.star <- weighted.mean(y, wt/nr, na.rm=TRUE)
    s.star <- round(Nstar(y, de, nr) * y.bar.star, digits=0)
    return(s.star)
}

xtab.yr$xtrow <- 1:nrow(xtab.yr)

NN.dt <- aaply(YY, 2, function (q.var) {
    gdf <- grouped_df(data.frame(y = q.var, de.df), as.list(covs.yr),
                      drop = FALSE)
    sdf <- summarise(gdf, nstar = Nstar(y, de, Nresponses))
    mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
    mdf <- plyr:::arrange(mdf, xtrow)
    out <- mdf$nstar
    names(out) <- levels(group.yr)
    return(out)
}, .progress = "text")
gc()
NN <- t(as.data.frame(NN.dt))
gc()
SS.dt <- aaply(YY, 2, function (q.var) {
    gdf <- grouped_df(data.frame(y = q.var, de.df), as.list(covs.yr),
                      drop = FALSE)
    sdf <- summarise(gdf, sstar = Sstar(y, de, Nresponses, new_weight))
    mdf <- merge(x = xtab.yr, y = sdf, by = covs.yr, all.x = TRUE)
    mdf <- plyr:::arrange(mdf, xtrow)
    out <- mdf$sstar
    names(out) <- levels(group.yr)
    return(out)
}, .progress = "text")
SS <- t(as.data.frame(SS.dt))
gc()

if (any(grepl("gss", ls())) && all(covs %in% names(gss.df.table))) {
    ## collapse GSS data and merge with other poll data
    gss.vars.use <- gss.vars[laply(gss.years.asked[, gss.vars], any)]
    gss.NN.sub <- aaply(gss.NN[, gss.vars.use], 2, function (q.var) {
        daply(subset(data.frame(gss.df.table, q.var), Year %in% yr.range),
              Formula(covs.yr), function (df) sum(df$q.var))
    }, .progress = "text")
    gss.NN.sub <- t(as.data.frame(gss.NN.sub))
    gss.SS.sub <- aaply(gss.SS[, gss.vars.use], 2, function (q.var) {
        daply(subset(data.frame(gss.df.table, q.var), Year %in% yr.range),
              Formula(covs.yr), function (df) sum(df$q.var))
    }, .progress = "text")
    gss.SS.sub <- t(as.data.frame(gss.SS.sub))
    q.overlap <- intersect(colnames(NN), colnames(gss.NN.sub))
    for (i in seq_along(q.overlap)) {
        qo <- q.overlap[i]
        print(qo)
        NN[, qo] <- NAto0(NN[, qo]) + NAto0(gss.NN.sub[, qo])
        SS[, qo] <- NAto0(SS[, qo]) + NAto0(gss.SS.sub[, qo])
    }
    gss.vars.use <- setdiff(gss.vars.use, q.overlap)
    gss.NN.sub <- gss.NN.sub[, gss.vars.use]
    gss.SS.sub <- gss.SS.sub[, gss.vars.use]
    NN.cmb <- cbind(NN, gss.NN.sub)
    SS.cmb <- cbind(SS, gss.SS.sub)
    (qs.used <- c(gt.vars.use, gss.vars.use))
} else {
    NN.cmb <- NN
    SS.cmb <- SS
    (qs.used <- gt.vars.use)
}
NN.cmb <- NN.cmb[, qs.used]
SS.cmb <- SS.cmb[, qs.used]
## Replace NA with 0
NN.cmb[is.na(NN.cmb)] <- 0
SS.cmb[is.na(SS.cmb)] <- 0
stopifnot(all(SS.cmb - NN.cmb <= 0)) ## no more success than trials

## Years with surveys
(svy.yrs <- yr.range[laply(yr.range, function (yr) {
    any(NN.cmb[grepl(yr, rownames(NN.cmb)), ] > 0)
})])
svy.yr.range <- min(svy.yrs):max(svy.yrs)
two.digit.labels <- paste0("'", str_sub(unique(4*trunc(svy.yr.range/4)), 3, 4))

T <- length(svy.yr.range) ## some years may not have a question
Q <- length(qs.used)
G <- nlevels(group)
nat.vars <- demo.vars
demo.group <- interaction(as.list(xtab[, nat.vars, drop=FALSE]))
Gnat <- nlevels(demo.group)

## Weights
post.wt.vars <- covs.yr
target.ds <- update(target.ds, YearFactor=factor(Year, svy.yr.range))
xtab.ds <- svydesign(~1, probs=1, data=xtab.yr)
xtab.ps <- postStratify(xtab.ds, Formula(post.wt.vars),
                        svytable(Formula(post.wt.vars), target.ds))
nat.wts <- 1/xtab.ps$prob
WT <- array(nat.wts, dim=c(G, T, Gnat))
WT <- aperm(WT, c(2, 3, 1)) ## T x Gnat x G
for (i in seq_len(Gnat)) {
    (dg <- levels(demo.group)[i])
    WT[, i, !demo.group == dg] <- 0
    WT[, i, ] <- WT[, i, ] / rowSums(WT[, i, ])
    (WT[T, i, ] %*% XX)
}
apply(WT, 1, rowSums)

## National-only questions (none for paper replication)
NN.nat <- array(FALSE, dim=dim(NN.cmb), dimnames=dimnames(NN.cmb))
nat_only <- array(0, dim=c(T, Q), dimnames=list(svy.yr.range, qs.used))
(N <- base:::sum(NN.cmb > 0 & !NN.nat)) ## groups with data and not national-only

NNnat <- SSnat <- array(0, c(T, Q, Gnat),
                        list(svy.yr.range, qs.used, levels(demo.group)))
n_vec <- s_vec <- integer(N) ## Vectors of trials and successes
names(n_vec) <- names(s_vec) <- seq_len(N)
MMM <- array(1, dim=list(T, Q, G)) ## Missingness indicator array
pos <- 0
for (t in 1:T) {
    print(yr <- svy.yr.range[t])
    if (!yr %in% svy.yrs) next
    yr.obs <- grep(yr, rownames(NN.cmb)) 
    for (q in 1:Q) {
        if (nat_only[t, q]) { ## if nation only
            print(c(t, q))
            for (h in seq_len(Gnat)) {
                gn.obs <- demo.group == levels(demo.group)[h]
                (wt.mn <- weighted.mean(SS.cmb[yr.obs, q] /
                                        NN.cmb[yr.obs, q],
                                        WT[t, h, ], na.rm=TRUE))
                (NNnat[t, q, h] <- ceiling(sum(NN.cmb[yr.obs[gn.obs], q])))
                (SSnat[t, q, h] <- round(wt.mn * NNnat[t, q, h]))
            }
        } else {
            for (g in 1:G) {
                if (NN.cmb[yr.obs[g], q] == 0) next ## skip if no data
                pos <- pos + 1 
                MMM[t, q, g] <- 0 ## mark as not missing
                n_vec[pos] <- NN.cmb[yr.obs[g], q]
                s_vec[pos] <- SS.cmb[yr.obs[g], q]
                names(n_vec)[pos] <- names(s_vec)[pos] <-
                    paste(rownames(NN.cmb)[yr.obs[g]],
                          colnames(NN.cmb)[q], sep=" | ")
            }
        }
    }
}
stopifnot(identical(as.integer(pos), N))

gc()
quantile(n_vec, seq(.05, .95, .05))
table(n_vec)
table(s_vec)
mean(!MMM) ## N.B. slight inconsistency due to collapsing of gun questions

## Extrapolate years not covered by state file
yrs.before <- min(st.dta$year) - min.yr
yrs.after <- max.yr - max(st.dta$year)
if (yrs.before > 0) {
    rows.before <- which(st.dta$year == min(st.dta$year))
    rows.before <- rep(rows.before, times = yrs.before)
    tmp <- st.dta[rows.before, ]
    is.na(tmp[, st.vars]) <- TRUE
    tmp$year <- tmp$year -
        rep(yrs.before:1, each=sum(st.dta$year == min(st.dta$year)))
    st.dta <- rbind(tmp, st.dta)
}
if (yrs.after > 0) {
    rows.after <- which(st.dta$year == max(st.dta$year))
    rows.after <- rep(rows.after, times = yrs.after)
    tmp <- st.dta[rows.after, ]
    is.na(tmp[, st.vars]) <- TRUE
    tmp$year <- tmp$year +
        rep(1:yrs.after, each=sum(st.dta$year == max(st.dta$year)))
    st.dta <- rbind(st.dta, tmp)
}
for (v in seq_along(st.vars)) {
    (sv <- st.vars[v])
    (min.valid.yr <- min(st.dta$year[!is.na(st.dta[sv])]))
    min.valid.obs <- which(st.dta$year == min.valid.yr)
    (max.valid.yr <- max(st.dta$year[!is.na(st.dta[sv])]))
    max.valid.obs <- which(st.dta$year == max.valid.yr)
    if (min.yr < min.valid.yr) {
        for (i in seq_len(min.valid.yr - min.yr)) {
            st.dta[min.valid.obs - i*51, sv] <- st.dta[min.valid.obs, sv]
        }
    }
    if (max.yr > max.valid.yr) {
        for (i in seq_len(max.yr - max.valid.yr)) {
            st.dta[max.valid.obs + i*51, sv] <- st.dta[max.valid.obs, sv]
        }
    }
}
## standardize by year
st.dta <- ddply(st.dta, ~year, mutate,
                prop_evangelicals0=scale(prop_evangelicals),
                income_percapita0=scale(income_percapita),
                prop_union0=scale(prop_union),
                prop_urban0=scale(prop_urban))
summary(st.dta)

if (is.null(geo.mod.vars)) {
    zz.names <- list(svy.yr.range, levels(poll.df[, geo.var]),
                     "Placeholder Zero")
    ZZ <- array(data=0, dim=llply(zz.names, length), dimnames=zz.names)
} else {
    if (geo.var == "StPOAbrv") {
        ZZ <- melt(subset(st.dta,, -REGION4), id.vars=c("StPOAbrv", "year"))
        ZZ <- acast(ZZ, year ~ StPOAbrv ~ variable)
        ZZ <- ZZ[as.character(svy.yr.range),, geo.mod.vars, drop=FALSE]
        dim(ZZ)
    }
}
if (is.null(geo.mod.prior.vars)) {
    zz.prior.names <- list(svy.yr.range, levels(poll.df[, geo.var]),
                           "Placeholder Zero")
    ZZ.prior <- array(data=0, dim=llply(zz.prior.names, length),
                      dimnames=zz.prior.names)
    ZZ.prior <- ZZ.prior[as.character(min(svy.yr.range)),,, drop=FALSE]
} else {
    if (geo.var == "StPOAbrv") {
        ZZ.prior <- melt(subset(st.dta,, -REGION4),
                         id.vars=c("StPOAbrv", "year"))
        ZZ.prior <- acast(ZZ.prior, year ~ StPOAbrv ~ variable)
        ZZ.prior <- ZZ.prior[as.character(min(svy.yr.range)),,
                             geo.mod.prior.vars, drop=FALSE]
        dim(ZZ.prior)
    }
}

setwd("~/Dropbox/Shared/Work/Dynamic_MRP/Drafts/DIRT/Replication/LiberalismApplication")
save(list = setdiff(ls(), c('poll.df', 'de.df', 'YY', 'xtab', 'xtab.yr', 'MF',
         'MF.yr')), file = "LibDataForStan.RData")

